setwd("~/data/intermediate")

library(tidyverse)
library(anytime)
library(lme4)
library(lmerTest)
library(interactions)
library(stringr)
library(sentimentr)

#-------------Preprocessing data sets-------------#

# read in the data sets processed by LIWC

posts_liwc_2 <- read.csv("LIWC_22_Results_posts_2.csv", header = TRUE)
comments_liwc_2 <- read.csv("LIWC_22_Results_comments_2.csv", header = TRUE)

# Transform date

posts_liwc_2$time_category <- anytime(posts_liwc_2$created_utc)
posts_liwc_2$time_category <- anydate(posts_liwc_2$time_category)
posts_liwc_2$time_category <- format(as.Date(posts_liwc_2$time_category), "%Y-%m")
posts_liwc_2$time_category <- anydate(posts_liwc_2$time_category)

# delete the first two rows because they are in 2015

posts_liwc_2 = posts_liwc_2[-c(1,2),]

comments_liwc_2 <- filter(comments_liwc_2, link_id %in% posts_liwc_2$id)

#--------- Descriptive analyses ------------#


# ANOVA between analytical thinking scores of POSTS in (1) pro-vaccine group, (2) anti-vaccine group, (3) two-sided group, and (4) neutral group

anova_post <- aov(Analytic ~ message_category, data = posts_liwc_2)
summary(anova_post)
posts_analytic <- group_by(posts_liwc_2, message_category) %>%
  summarise(mean=mean(Analytic, na.rm = TRUE), sd=sd(Analytic, na.rm = TRUE))

tukey_anova_post <- TukeyHSD(anova_post)
tukey_anova_post


# visualization of analytical scores of POST in four groups across time

group_mean_posts <- posts_liwc_2 %>% 
                group_by(message_category, time_category) %>%
                summarise(analytic_mean = mean(Analytic, na.rm=TRUE))

ggplot(group_mean_posts, aes(x = time_category, y = analytic_mean, group = message_category)) +
    geom_line(aes(color = message_category)) +
    scale_color_manual(values = c("orange", "dark gray", "blue", "purple"),
                    labels = c("Antivaccine","Neutral", "Provaccine", "2-sided")) +
    labs(x = "Year",
         y = "Analytical thinking",
         colour = "Post stance")

# ANOVA between analytical thinking scores of COMMENTS in (1) pro-vaccine group, (2) anti-vaccine group, (3) two-sided group, and (4) neutral group

group_mean_comments_1 <- comments_liwc_2 %>%
  group_by(post_stance, time_category) %>%
  summarise(analytic_mean = mean(Analytic, na.rm=TRUE))

group_mean_comments_1$time_category <- anydate(group_mean_comments_1$time_category)

ggplot(group_mean_comments_1, aes(x = time_category, y = analytic_mean, group = post_stance)) +
  geom_line(aes(color = post_stance)) +
  scale_color_manual(values = c("orange", "dark gray", "blue", "purple"),
                     labels = c("Antivaccine","Neutral", "Provaccine", "2-sided")) +
  labs(x = "Year", 
       y = "Analytical thinking",
       colour = "Post stance")

comments_analytic <- group_by(comments_liwc_2, post_stance) %>%
  summarise(mean=mean(Analytic, na.rm = TRUE), sd=sd(Analytic, na.rm = TRUE))

anova_comments <- aov(Analytic ~ post_stance, data = comments_liwc_2)
summary(anova_comments)


# ANOVA between analytical thinking scores of COMMENTS of authors who are (1) pro-vaccine, (2) anti-vaccine, (3) mixed, and (4) uncertain

group_mean_comments_2 <- comments_liwc_2 %>%
  group_by(author_stance, time_category) %>%
  summarise(analytic_mean = mean(Analytic, na.rm=TRUE))

group_mean_comments_2$time_category <- anydate(group_mean_comments_2$time_category)

ggplot(group_mean_comments_2, aes(x = time_category, y = analytic_mean, group = author_stance)) +
  geom_line(aes(color = author_stance)) +
  scale_color_manual(values = c("orange", "dark gray", "blue", "purple"),
                     labels = c("Antivaccine","Neutral", "Provaccine", "2-sided")) +
  labs(x = "Year", 
       y = "Analytical thinking",
       colour = "Comment authors' stance")

comments_analytic_author <- group_by(comments_liwc_2, author_stance) %>%
  summarise(mean=mean(Analytic, na.rm = TRUE), sd=sd(Analytic, na.rm = TRUE))

anova_comments_author <- aov(Analytic ~ author_stance, data = comments_liwc_2)
summary(anova_comments_author)

#--------- Inferential analysis: analytical thinking ------------#

# filter out pro & anti posts AND pro & anti authors' comments 

comments_liwc_2 <- read.csv("data_for_analytical.csv", header = TRUE)

comments_for_analysis <- comments_liwc_2 %>%
  filter(author_stance == "pro_vaccine" | author_stance == "anti_vaccine") %>%
  filter(post_stance == "pro_vaccine" | post_stance == "anti_vaccine")

## count the number of comments and posts

length(unique(comments_for_analysis$id)) #n=14978 comments

length(unique(comments_for_analysis$link_id)) #n=7777 posts

## model0: null model to calculate ICC

### Function to compute the intraclass correlation coefficient (ICC)

compute_icc <- function(lmer_object){
  var_dat <- lmer_object %>% VarCorr %>% as.data.frame
  icc <- var_dat$vcov[1]/(var_dat$vcov[1]+var_dat$vcov[2])
  return(icc)
}

model0_post_author <- lmer(formula = Analytic ~ 1 + (1 | post_author),
               data = comments_for_analysis)

compute_icc(model0_post_author) %>%
  round(3) # ICC =0.114 post author

model0_post <- lmer(formula = Analytic ~ 1 + (1 | link_id),
                           data = comments_for_analysis)

compute_icc(model0_post) %>%
  round(3) # ICC =0.126 post

model0_comment_author <- lmer(formula = Analytic ~ 1 + (1 | author),
                    data = comments_for_analysis)

compute_icc(model0_comment_author) %>%
  round(3) # ICC =0.181 comment author

# small ICC means higher variability within the clusters and consequently lower variability between the clusters. 

## FINAL SELECTED: model2: clusters include posts, comment authors, and post authors

model2 <- lmer(formula = Analytic ~ 1 + author_stance + post_stance + post_analytic + post_stance:author_stance + 
                 (1 | link_id) + (1 | author) + (1 | post_author),
               data = comments_for_analysis, REML = TRUE)
summary(model2)

## Additional analysis: add subreddit as a confounding variable

model2_add <- lmer(formula = Analytic ~ 1 + author_stance + post_stance + post_analytic + post_stance:author_stance + 
                 (1 | link_id) + (1 | author) + (1 | post_author) + (1 | subreddit),
               data = comments_for_analysis, REML = TRUE)
summary(model2_add)

# plotting #

cat_plot(model2, pred=post_stance, modx=author_stance, geom = "line", interval=TRUE, y.label = "Analytical thinking", x.label = "Post stance", legend.main = "Comment authors' stance", pred.labels = c("Antivaccine", "Provaccine"), modx.labels = c("Antivaccine", "Provaccine"))

#-------------Inferential analysis: homophily using chi-square------------------#

# read in data set for selective exposure

for_selective_exposure <- read.csv("data_selective_exposure2.csv", header = TRUE)

length(unique(for_selective_exposure$link_id)) # posts = 13899
length(unique(for_selective_exposure$id)) # comments = 89347
length(unique(for_selective_exposure$author))

chi_square <- chisq.test(table(for_selective_exposure$post_stance, for_selective_exposure$author_stance))
chi_square
table(for_selective_exposure$author_stance, for_selective_exposure$post_stance)

# Additional analysis: filter out top subreddits and rerun the analyses

for_selective_exposure <- read.csv("data_selective_exposure2.csv", header = TRUE)

count_subreddits <- as.data.frame(table(for_selective_exposure$subreddit)) #conspiracy n= 13649, vaxxhappened n= 12400

## filter out top 2 subreddits: conspiracy & vaxxhappened

for_selective_exposure_subreddit <- for_selective_exposure %>%
  filter(str_detect(subreddit, 'conspiracy|vaxxhappened')) %>%
  filter(!grepl("conspiracy_commons", subreddit) & !grepl("conspiracytheorie", subreddit) & !grepl("conspiracyundone", subreddit))

chi_square_subreddit <- chisq.test(table(for_selective_exposure_subreddit$post_stance, for_selective_exposure_subreddit$author_stance))
chi_square_subreddit
table(for_selective_exposure_subreddit$author_stance, for_selective_exposure_subreddit$post_stance)

length(unique(for_selective_exposure_subreddit$id)) # comments = 26032
table(for_selective_exposure_subreddit$author_stance) # anti = 12358, pro = 13691

#----------Inferential analysis: language coordination-------------#

anti_anti <- read.csv("anti_to_anti.csv", header = TRUE)
anti_anti$speaker <- "anti"
anti_anti$target <- "anti"

anti_pro <- read.csv("anti_to_pro.csv", header = TRUE)
anti_pro$speaker <- "anti"
anti_pro$target <- "pro"

pro_anti <- read.csv("pro_to_anti.csv", header = TRUE)
pro_anti$speaker <- "pro"
pro_anti$target <- "anti"

pro_pro <- read.csv("pro_to_pro.csv", header = TRUE)
pro_pro$speaker <- "pro"
pro_pro$target <- "pro"

all_data <- rbind(anti_anti, anti_pro, pro_anti, pro_pro)

# plotting

library(ggplot2)

for_plot <- all_data %>%
  group_by(target, speaker) %>%
  summarise(mean_score = mean(average_score, na.rm = TRUE),
            se = sd(average_score)/sqrt(length(average_score)))

bar_plot <- ggplot(for_plot, aes(x = target, y = mean_score, fill = factor(speaker))) +
            geom_bar(stat = "identity",
                     position = position_dodge(width = 0.9)) +
            scale_fill_manual(values = c("blue", "orange"),
                              labels = c("Antivaccine", "Provaccine")) +
            geom_errorbar(aes(ymax = mean_score + se,
                              ymin = mean_score - se),
                              position = position_dodge(width = 0.9),
                              width = 0.25) +
            labs(x = "Stance of posts or comments being replied to",
                 y = "Language coordination",
                 fill = "Comment authors' stance") +
            scale_x_discrete(labels=c("anti" = "Antivaccine", "pro" = "Provaccine"))
print(bar_plot)

# check normality

library(ggpubr)

ggdensity(anti_anti$average_score, 
          main = "Density plot of average score",
          xlab = "average score")
ggqqplot(anti_anti$average_score)
shapiro.test(anti_anti$average_score) #significant --> not normal, i.e., normality assumption not met

ggdensity(pro_anti$average_score, 
          main = "Density plot of average score",
          xlab = "average score")
ggqqplot(pro_anti$average_score)
shapiro.test(pro_anti$average_score) #not normal, i.e., normality assumption not met

# check equal variance

to_anti <- rbind(anti_anti, pro_anti)
res.ftest <- var.test(average_score ~ speaker, data = to_anti)
res.ftest #p=.88 there is no significant difference between the variances of the two sets of data. 
#Therefore, we can use the classic t-test witch assume equality of the two variances.

t.test(average_score ~ speaker, data = to_anti, paired=FALSE, var.equal = TRUE) 
wilcox.test(anti_anti$average_score, pro_anti$average_score, alternative = "two.sided")

t.test(anti_anti$average_score, pro_anti$average_score, paired=FALSE, var.equal = FALSE)

summary(anti_anti$average_score) #m = 0.0017
sd(anti_anti$average_score) #sd = 0.0195

summary(pro_anti$average_score) #m = 0.0055
sd(pro_anti$average_score) #sd = 0.0351

t.test(anti_pro$average_score, pro_pro$average_score, paired=FALSE, var.equal = FALSE)

summary(anti_pro$average_score) #m = 0.0033
sd(anti_pro$average_score) #sd = 0.0249

summary(pro_pro$average_score) #m = 0.0006
sd(pro_pro$average_score) #sd = 0.0185

# test only use participants who reply to both.

intersect_anti <- intersect(anti_anti$speaker_id, anti_pro$speaker_id)

intersect_pro <- intersect(pro_anti$speaker_id, pro_pro$speaker_id)

for_anti1 <- anti_anti %>%
  filter(speaker_id %in% intersect_anti)
for_anti2 <- anti_pro %>%
  filter(speaker_id %in% intersect_anti)

t.test(for_anti2$average_score, for_anti1$average_score, paired = TRUE, alternative = "two.sided") #p=.48

summary(for_anti1$average_score) #m = 0.0023
sd(for_anti1$average_score) #sd = 0.0215

summary(for_anti2$average_score) #m = 0.0038
sd(for_anti2$average_score) #sd = 0.0232

for_pro1 <- filter(pro_anti, speaker_id %in% intersect_pro)
for_pro2 <- filter(pro_pro, speaker_id %in% intersect_pro)

t.test(for_pro1$average_score, for_pro2$average_score, paired = TRUE, alternative = "two.sided") #p=.03

summary(for_pro1$average_score) #m = 0.0078
sd(for_pro1$average_score) #sd = 0.0375

summary(for_pro2$average_score) #m = 0.0029
sd(for_pro2$average_score) #sd = 0.0203

## plotting

all_data_bridges <- rbind(for_anti1, for_anti2, for_pro1, for_pro2)

for_plot_bridges <- all_data_bridges %>%
  group_by(target, speaker) %>%
  summarise(mean_score = mean(average_score, na.rm = TRUE),
            se = sd(average_score)/sqrt(length(average_score)))

bar_plot_bridges <- ggplot(for_plot_bridges, aes(x = speaker, y = mean_score, fill = factor(target))) +
  geom_bar(stat = "identity",
           position = position_dodge(width = 0.9)) +
  scale_fill_manual(values = c("blue", "orange"),
                    labels = c("Antivaccine", "Provaccine")) +
  geom_errorbar(aes(ymax = mean_score + se,
                    ymin = mean_score - se),
                position = position_dodge(width = 0.9),
                width = 0.25) +
  labs(x = "Stance of cultural bridges",
       y = "Language coordination",
       fill = "Stance of Redditors that \ncultural bridges reply to") +
  scale_x_discrete(labels=c("anti" = "Antivaccine", "pro" = "Provaccine"))
print(bar_plot_bridges)

# Additional analysis: filter out top subreddits and rerun the analyses
